QC across assays & MAE augmentation

Author

Laura Symul

Published

May 30, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the MultiAssayExperiment object

Code
data_source <- "real"
mae_files <- 
  fs::dir_ls(
    str_c(
      get_data_dir(data_source = data_source),  
      "02 MAEs/"
    ),
    regexp = "mae_full_.*\\.rds$"
  ) |> 
  sort(decreasing = TRUE) |>
  magrittr::extract(1)

mae <- readRDS(mae_files)

rm(mae_files)

Cross-assay QC

Comparing qPCR and metagenomics

Code
tmp <- 
  dplyr::inner_join(
    mae[["mg"]] |> 
      as_tibble() |>
      filter(!is.na(LBP)) |> 
      filter(!is.na(rel_abs_bact)) |> 
      select(.feature, .sample, rel_abs_bact, sample_type) |> 
      dplyr::rename(
        mg_rel_ab_bact = rel_abs_bact
      ),
    mae[["qPCR"]] |>
      as_tibble() |>
      filter(!is.na(LBP)) |> 
      select(.feature, .sample, copies_per_swab_med, copies_per_swab_cv, strain_group_qpcr),
    by = join_by(.feature, .sample)
  )

tmp |> 
  ggplot() +
  aes(x = mg_rel_ab_bact, y = copies_per_swab_med, color = sample_type) +
  geom_point(alpha = 0.3, size = 1) +
  facet_wrap(~ strain_group_qpcr + .feature) +
  scale_y_log10() +
  scale_x_log10()

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 0) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 0`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 1e+05) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 1e+05`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 10000) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 10000`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

TODO sensitivity-specificity curves for each LBP

Metagenomics and 16S rRNA amplicon sequencing

TODO

Temporal profiles (for 10 “random” participants)

Weekly profiles

Code
weekly_samples_from_randomized_participants <- 
  dplyr::inner_join(
    tibble(uid = mae[["mg"]]$uid),
    tibble(uid = mae[["amplicon"]]$uid),
    by = join_by(uid)
  ) |> 
  dplyr::inner_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      filter(randomized) |> 
      select(uid, pid, visit_code, randomized, arm, visit_type),
    by = join_by(uid)
  ) |> 
  filter(visit_type == "Clinic")
  

mae_sub <- mae[, weekly_samples_from_randomized_participants$uid]
Code
random_pids <- mae_sub$pid |> table() |> sort(decreasing = TRUE) |> names() |> head(10)
Code
map(
  random_pids,
  plot_data_for_pid,
  mae_sub = mae_sub
)
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]

Full profiles

Code
all_samples_from_randomized_participants <- 
  dplyr::full_join(
    tibble(uid = mae[["mg"]]$uid),
    tibble(uid = mae[["amplicon"]]$uid),
    by = join_by(uid)
  ) |> 
  dplyr::inner_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      filter(randomized) |> 
      select(uid, pid, visit_code, randomized, arm),
    by = join_by(uid)
  )

mae_sub <- mae[, all_samples_from_randomized_participants$uid]
Code
# random_pids <- mae_sub$pid |> table() |> sort(decreasing = TRUE) |> names() |> head(10)
Code
map(
  random_pids,
  plot_data_for_pid,
  mae_sub = mae_sub
)
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]

Code
plot_data_for_pid(selected_pid = "068100046", mae_sub = mae_sub)

We rework this graph for all pids and save it as a png.

Code
plot_path <- str_c(
    get_data_dir(data_source = data_source),  
    "03 QCed MAEs/profiles_plot/")

all_pids <- unique(all_samples_from_randomized_participants$pid)

for (pid in all_pids) {
  p <- plot_data_for_pid(selected_pid = pid, mae_sub = mae_sub)
  
  file_name <- paste0("plot_", pid, ".png")
  
  ggsave(
    filename = str_c(plot_path, file_name),
    plot = p,
    width = 15, height = 8, dpi = 300
  )
}

Participant groups

“Guessing the arms” based on the MG and qPCR data

  • “LC-115” if at any visit, two-three LBP115-specific strains are detected >0

  • “LC-106-o” if in group 4

  • “LC-106-3 or -7” if at any visit, two LBP106-specific strains are detected >0

  • otherwise -> placebo

TODO

Primary outcomes

Primary outcome definition

The primary outcome is “colonization with any of the L. crispatus strains contained in the LBP by 5 weeks of follow-up as assessed by metagenomic sequencing of the vaginal microbial community with detection of any one of LBP strains at >5% relative abundance or a combination of the strains accounting for >10% relative abundance by metagenomics.”

By metagenomics

“LBP colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization_mg at that visit.

Code
colonization_LBP_mg <- 
  mae[["mg"]] |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample") |> 
  filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    n = n(),
    total_LBP_rel_ab_at = sum(rel_abs_bact),
    max_LBP_rel_ab_at = max(rel_abs_bact),
    n_detected_strains_at = sum(rel_abs_bact > 0),
    n_detected_LC106_strains_at = sum(rel_abs_bact[LBP == "LC-106 & LC-115"] > 0),
    n_detected_LC115_only_strains_at = sum(rel_abs_bact[LBP == "LC-115"] > 0)
  ) |> 
  mutate(
    colonized_LBP_at_mg = (total_LBP_rel_ab_at > 0.1) | (max_LBP_rel_ab_at > 0.05)
  ) 
Code
colonization_LBP_mg <-  
  colonization_LBP_mg |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“LBP colonization” by each visit

From the colonization_mg status at each visit, we can compute the colonization_mg status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_LBP_mg <- 
  colonization_LBP_mg |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_LBP_mg |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_LBP_mg$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_LBP_mg <- 
  colonization_LBP_mg |> 
  arrange(pid, visit_code) |>
  mutate(colonized_LBP_at_mg = colonized_LBP_at_mg |> replace_na(FALSE)) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, colonized_LBP_at_mg),
    colonized_LBP_by_mg = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() |> 
  select(-tmp)
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

“Crispatus colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization_mg at that visit.

Code
colonization_crispatus_mg <- 
  mae[["mg"]] |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample") |> 
  filter(species == "crispatus") |> 
  group_by(.sample) |> 
  summarize(
    total_crispatus_rel_ab_at = sum(rel_abs_bact),
    #max_crispatus_rel_ab_at = max(rel_abs_bact),
    #n_detected_crispatus_at = sum(rel_abs_bact > 0)
  ) |> 
  mutate(
    crispatus_dominance_at_mg = (total_crispatus_rel_ab_at > 0.5)
  ) 
Code
colonization_crispatus_mg <-  
  colonization_crispatus_mg |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“Crispatus colonization” by each visit

From the colonization_mg status at each visit, we can compute the colonization_mg status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_crispatus_mg <- 
  colonization_crispatus_mg |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_crispatus_mg |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_crispatus_mg$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_crispatus_mg <- 
  colonization_crispatus_mg |> 
  arrange(pid, visit_code) |>
  mutate(crispatus_dominance_at_mg = crispatus_dominance_at_mg |> replace_na(FALSE)) |>
  group_by(pid) |> 
  mutate(
    #tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, crispatus_dominance_at_mg),
    crispatus_dominance_by_mg = cummax(crispatus_dominance_at_mg) |> as.logical()
  ) |> 
  ungroup() #|> 
  #select(-tmp)
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_by_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

LBP colonization VS crispatus dominance

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

 p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization VS crispatus dominance")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_by_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization VS crispatus dominance")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

By qPCR

“LBP colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization at that visit.

Code
colonization_qPCR <- 
  mae[["qPCR"]] |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample") |> 
  filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    n = n(),
    total_load_at = sum(copies_per_swab_med),
    n_detected_strains_at = sum(copies_per_swab_med > 0),
    n_detected_LC106_strains_at = sum(copies_per_swab_med[LBP == "LC-106 & LC-115"] > 0),
    n_detected_LC115_only_strains_at = sum(copies_per_swab_med[LBP == "LC-115"] > 0)
  ) |> 
  mutate(
    detected_at_qpcr = total_load_at > 0
  ) 
Code
colonization_qPCR <-  
  colonization_qPCR |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = detected_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“LBP colonization” by each visit

From the colonization status at each visit, we can compute the colonization status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_qPCR <- 
  colonization_qPCR |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_qPCR |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_qPCR$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = detected_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_qPCR <- 
  colonization_qPCR |> 
  arrange(pid, visit_code) |>
  mutate(colonized_at_qpcr = detected_at_qpcr |> replace_na(FALSE)) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, colonized_at_qpcr),
    detected_by_qpcr = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() |> 
  select(-tmp)
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = detected_by_qpcr) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Colonization qPCR VS metagenomics

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = detected_by_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization at metagenomics VS qPCR")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )


p2 <- colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = detected_by_qpcr) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization by metagenomics VS qPCR")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Adding primary_outcomes assay to MAE

Code
se_primary_col_LBP_mg <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_LBP_mg |> 
        # filter(!is.na(pid)) |> 
        # mutate(.sample = str_c(pid, "_", visit_code)) |> 
        filter(!is.na(.sample)) |> 
        select(.sample, colonized_LBP_at_mg, colonized_LBP_by_mg) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
  ) 
)

se_primary_col_crisp_mg <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_crispatus_mg |> 
        filter(!is.na(.sample)) |> 
        select(.sample, crispatus_dominance_at_mg, crispatus_dominance_by_mg) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
  ) 
)

se_primary_col_LBP_qPCR <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_qPCR |> 
        filter(!is.na(.sample)) |> 
        select(.sample, detected_at_qpcr, detected_by_qpcr) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
    )
  ) 



mae <- c(mae, list(col_LBP_mg = se_primary_col_LBP_mg, 
                   col_crispatus_mg = se_primary_col_crisp_mg,
                   col_LBP_qPCR = se_primary_col_LBP_qPCR))

Saving the MultiAssayExperiment objects

We save the “master” MAE (with all assays); but if needed for publication, we can subset the MAE to only include the assays/data of interest.

Code
saveRDS(
  mae, 
  str_c(
    get_data_dir(data_source = data_source),  
    "03 QCed MAEs/",
    "mae_full_", today() |> str_remove_all("-"), ".rds"
  )
)